library(tidyverse) # the usual stuff: dplyr, readr, and other goodies
library(lubridate) # to handle dates
library(GGally) # for correlation-scatter plot matrix
library(ggfortify) # to produce residual diagnostic plots
library(rsample) # to split dataframe in training- & testing sets
library(janitor) # clean_names()
library(broom) # use broom:augment() to get tidy table with regression output, residuals, etc
library(huxtable) # to get summary table of all models produced
library(kableExtra) # for formatting tables
library(moderndive) # for getting regression tables
library(skimr) # for skim
library(mosaic)
library(leaflet) # for interactive HTML maps
library(tidytext)
library(viridis)
library(vroom)
library(lmtest)
library(sandwich)
library(ggbeeswarm)
library(scales)
library(ggcorrplot)
library(ggthemes)# use cache=TRUE so you dont donwload the data everytime you knit
listings <- vroom("http://data.insideairbnb.com/china/hk/hong-kong/2021-09-24/data/listings.csv.gz") %>%
clean_names()# Obtain an overview of the raw values
dplyr::glimpse(listings)Rows: 6,046
Columns: 74
$ id <dbl> 17891, 69074, 103760, 104~
$ listing_url <chr> "https://www.airbnb.com/r~
$ scrape_id <dbl> 2.021092e+13, 2.021092e+1~
$ last_scraped <date> 2021-09-25, 2021-09-25, ~
$ name <chr> "Large Light Filled Loft"~
$ description <chr> "Gorgeous and spacious lo~
$ neighborhood_overview <chr> "Best neighborhood in Hon~
$ picture_url <chr> "https://a0.muscache.com/~
$ host_id <dbl> 69063, 160139, 304876, 54~
$ host_url <chr> "https://www.airbnb.com/u~
$ host_name <chr> "Candace", "Amy", "Brend"~
$ host_since <date> 2010-01-09, 2010-07-07, ~
$ host_location <chr> "Los Angeles, California,~
$ host_about <chr> "Hi, my name is Candace C~
$ host_response_time <chr> "within a day", "within a~
$ host_response_rate <chr> "100%", "100%", "100%", "~
$ host_acceptance_rate <chr> "0%", "75%", "89%", "N/A"~
$ host_is_superhost <lgl> FALSE, FALSE, FALSE, FALS~
$ host_thumbnail_url <chr> "https://a0.muscache.com/~
$ host_picture_url <chr> "https://a0.muscache.com/~
$ host_neighbourhood <chr> "Sheung Wan", "Sheung Wan~
$ host_listings_count <dbl> 1, 2, 9, 0, 9, 9, 9, 1, 1~
$ host_total_listings_count <dbl> 1, 2, 9, 0, 9, 9, 9, 1, 1~
$ host_verifications <chr> "['email', 'phone', 'revi~
$ host_has_profile_pic <lgl> TRUE, TRUE, TRUE, TRUE, T~
$ host_identity_verified <lgl> TRUE, TRUE, TRUE, TRUE, T~
$ neighbourhood <chr> "Hong Kong Island, Hong K~
$ neighbourhood_cleansed <chr> "Central & Western", "Cen~
$ neighbourhood_group_cleansed <lgl> NA, NA, NA, NA, NA, NA, N~
$ latitude <dbl> 22.28327, 22.28350, 22.28~
$ longitude <dbl> 114.1499, 114.1485, 114.1~
$ property_type <chr> "Entire rental unit", "En~
$ room_type <chr> "Entire home/apt", "Entir~
$ accommodates <dbl> 3, 3, 6, 2, 6, 6, 6, 2, 4~
$ bathrooms <lgl> NA, NA, NA, NA, NA, NA, N~
$ bathrooms_text <chr> "1 bath", "1 bath", "1 ba~
$ bedrooms <dbl> NA, 1, 2, 1, 2, 2, 2, 1, ~
$ beds <dbl> 1, 2, 3, 1, 3, 3, 3, 1, 3~
$ amenities <chr> "[\"Washer\", \"Air condi~
$ price <chr> "$1,400.00", "$1,429.00",~
$ minimum_nights <dbl> 2, 2, 2, 1, 2, 2, 2, 1, 7~
$ maximum_nights <dbl> 365, 365, 365, 365, 365, ~
$ minimum_minimum_nights <dbl> 2, 2, 2, 1, 2, 2, 2, 1, 7~
$ maximum_minimum_nights <dbl> 2, 2, 2, 1, 2, 2, 2, 2, 7~
$ minimum_maximum_nights <dbl> 365, 365, 365, 365, 365, ~
$ maximum_maximum_nights <dbl> 365, 365, 365, 365, 365, ~
$ minimum_nights_avg_ntm <dbl> 2.0, 2.0, 2.0, 1.0, 2.0, ~
$ maximum_nights_avg_ntm <dbl> 365.0, 365.0, 365.0, 365.~
$ calendar_updated <lgl> NA, NA, NA, NA, NA, NA, N~
$ has_availability <lgl> TRUE, TRUE, TRUE, TRUE, T~
$ availability_30 <dbl> 13, 0, 27, 30, 0, 0, 26, ~
$ availability_60 <dbl> 43, 0, 57, 60, 0, 0, 56, ~
$ availability_90 <dbl> 73, 13, 87, 90, 0, 23, 86~
$ availability_365 <dbl> 318, 103, 252, 365, 116, ~
$ calendar_last_scraped <date> 2021-09-25, 2021-09-25, ~
$ number_of_reviews <dbl> 73, 135, 274, 14, 209, 22~
$ number_of_reviews_ltm <dbl> 0, 1, 2, 0, 4, 2, 3, 2, 0~
$ number_of_reviews_l30d <dbl> 0, 0, 2, 0, 0, 0, 1, 0, 0~
$ first_review <date> 2016-03-10, 2012-06-29, ~
$ last_review <date> 2017-11-29, 2019-08-03, ~
$ review_scores_rating <dbl> 4.76, 4.84, 4.44, 4.67, 4~
$ review_scores_accuracy <dbl> 4.73, 4.81, 4.39, 4.50, 4~
$ review_scores_cleanliness <dbl> 4.51, 4.77, 4.44, 4.86, 4~
$ review_scores_checkin <dbl> 4.92, 4.87, 4.46, 4.86, 4~
$ review_scores_communication <dbl> 4.93, 4.91, 4.60, 4.93, 4~
$ review_scores_location <dbl> 4.90, 4.90, 4.72, 4.79, 4~
$ review_scores_value <dbl> 4.66, 4.69, 4.40, 4.71, 4~
$ license <lgl> NA, NA, NA, NA, NA, NA, N~
$ instant_bookable <lgl> FALSE, FALSE, FALSE, FALS~
$ calculated_host_listings_count <dbl> 1, 1, 9, 1, 9, 9, 9, 1, 1~
$ calculated_host_listings_count_entire_homes <dbl> 1, 1, 5, 1, 5, 5, 5, 1, 1~
$ calculated_host_listings_count_private_rooms <dbl> 0, 0, 4, 0, 4, 4, 4, 0, 0~
$ calculated_host_listings_count_shared_rooms <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0~
$ reviews_per_month <dbl> 1.08, 1.20, 3.08, 0.12, 2~
# Transform price, host_response_rate and host_acceptance_rate into numeric variables for summary statistics
listings <- listings %>%
mutate(price = parse_number(price),
host_response_rate=parse_number(host_response_rate),
host_acceptance_rate=parse_number(host_acceptance_rate))
# Check whether transformation has been successful
typeof(c(listings$price, listings$host_response_rate, listings$host_acceptance_rate))[1] "double"
# Reduce dataset to exclude variables that are either irrelevant or too difficult to analyse
reduced_listings <- listings %>%
select(-c(id, listing_url, scrape_id, last_scraped, name, description, neighborhood_overview, picture_url, host_id, host_url, host_name, host_about, host_thumbnail_url, host_picture_url, host_verifications, minimum_minimum_nights, maximum_minimum_nights, minimum_maximum_nights, maximum_maximum_nights, calendar_updated, calendar_last_scraped, first_review, last_review))# See summary statistics (excl. minimum values for numeric variables)
skim(reduced_listings)| Name | reduced_listings |
| Number of rows | 6046 |
| Number of columns | 51 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| Date | 1 |
| logical | 8 |
| numeric | 33 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| host_location | 12 | 1.00 | 2 | 89 | 0 | 183 | 0 |
| host_response_time | 1 | 1.00 | 3 | 18 | 0 | 5 | 0 |
| host_neighbourhood | 824 | 0.86 | 2 | 33 | 0 | 144 | 0 |
| neighbourhood | 3486 | 0.42 | 9 | 70 | 0 | 178 | 0 |
| neighbourhood_cleansed | 0 | 1.00 | 5 | 17 | 0 | 18 | 0 |
| property_type | 0 | 1.00 | 3 | 35 | 0 | 70 | 0 |
| room_type | 0 | 1.00 | 10 | 15 | 0 | 4 | 0 |
| bathrooms_text | 22 | 1.00 | 6 | 17 | 0 | 35 | 0 |
| amenities | 0 | 1.00 | 2 | 1327 | 0 | 3844 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| host_since | 1 | 1 | 2010-01-09 | 2021-09-19 | 2016-04-18 | 1268 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| host_is_superhost | 1 | 1 | 0.22 | FAL: 4718, TRU: 1327 |
| host_has_profile_pic | 1 | 1 | 1.00 | TRU: 6035, FAL: 10 |
| host_identity_verified | 1 | 1 | 0.58 | TRU: 3489, FAL: 2556 |
| neighbourhood_group_cleansed | 6046 | 0 | NaN | : |
| bathrooms | 6046 | 0 | NaN | : |
| has_availability | 0 | 1 | 1.00 | TRU: 6028, FAL: 18 |
| license | 6046 | 0 | NaN | : |
| instant_bookable | 0 | 1 | 0.28 | FAL: 4349, TRU: 1697 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| host_response_rate | 889 | 0.85 | 82.94 | 32.72 | 0.00 | 90.00 | 99.00 | 100.00 | 100.00 | <U+2581><U+2581><U+2581><U+2581><U+2587> |
| host_acceptance_rate | 1501 | 0.75 | 68.50 | 28.40 | 0.00 | 50.00 | 72.00 | 100.00 | 100.00 | <U+2581><U+2582><U+2585><U+2585><U+2587> |
| host_listings_count | 1 | 1.00 | 104.24 | 146.09 | 0.00 | 3.00 | 13.00 | 225.00 | 457.00 | <U+2587><U+2581><U+2582><U+2581><U+2582> |
| host_total_listings_count | 1 | 1.00 | 104.24 | 146.09 | 0.00 | 3.00 | 13.00 | 225.00 | 457.00 | <U+2587><U+2581><U+2582><U+2581><U+2582> |
| latitude | 0 | 1.00 | 22.30 | 0.05 | 22.16 | 22.28 | 22.30 | 22.31 | 22.56 | <U+2581><U+2587><U+2581><U+2581><U+2581> |
| longitude | 0 | 1.00 | 114.16 | 0.05 | 113.86 | 114.16 | 114.17 | 114.18 | 114.36 | <U+2581><U+2581><U+2582><U+2587><U+2581> |
| accommodates | 0 | 1.00 | 2.70 | 2.35 | 0.00 | 2.00 | 2.00 | 3.00 | 16.00 | <U+2587><U+2582><U+2581><U+2581><U+2581> |
| bedrooms | 1192 | 0.80 | 1.29 | 0.82 | 1.00 | 1.00 | 1.00 | 1.00 | 11.00 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| beds | 147 | 0.98 | 1.62 | 1.51 | 0.00 | 1.00 | 1.00 | 2.00 | 16.00 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| price | 0 | 1.00 | 805.04 | 2265.67 | 0.00 | 214.00 | 397.50 | 700.00 | 84346.00 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| minimum_nights | 0 | 1.00 | 17.02 | 30.10 | 1.00 | 1.00 | 5.00 | 29.00 | 1125.00 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| maximum_nights | 0 | 1.00 | 857.22 | 431.75 | 1.00 | 365.00 | 1125.00 | 1125.00 | 1125.00 | <U+2582><U+2581><U+2581><U+2581><U+2587> |
| minimum_nights_avg_ntm | 1 | 1.00 | 17.05 | 30.06 | 1.00 | 1.00 | 5.60 | 29.00 | 1125.00 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| maximum_nights_avg_ntm | 1 | 1.00 | 913.49 | 397.12 | 1.00 | 1124.00 | 1125.00 | 1125.00 | 1125.00 | <U+2582><U+2581><U+2581><U+2581><U+2587> |
| availability_30 | 0 | 1.00 | 21.40 | 12.02 | 0.00 | 13.00 | 29.00 | 30.00 | 30.00 | <U+2583><U+2581><U+2581><U+2581><U+2587> |
| availability_60 | 0 | 1.00 | 45.30 | 22.48 | 0.00 | 35.00 | 59.00 | 60.00 | 60.00 | <U+2582><U+2581><U+2581><U+2581><U+2587> |
| availability_90 | 0 | 1.00 | 70.12 | 32.17 | 0.00 | 61.00 | 89.00 | 90.00 | 90.00 | <U+2582><U+2581><U+2581><U+2581><U+2587> |
| availability_365 | 0 | 1.00 | 256.53 | 132.47 | 0.00 | 116.00 | 358.00 | 364.00 | 365.00 | <U+2582><U+2582><U+2582><U+2581><U+2587> |
| number_of_reviews | 0 | 1.00 | 17.62 | 45.93 | 0.00 | 0.00 | 0.00 | 10.00 | 891.00 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| number_of_reviews_ltm | 0 | 1.00 | 1.40 | 6.96 | 0.00 | 0.00 | 0.00 | 0.00 | 110.00 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| number_of_reviews_l30d | 0 | 1.00 | 0.12 | 0.65 | 0.00 | 0.00 | 0.00 | 0.00 | 16.00 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| review_scores_rating | 3067 | 0.49 | 4.38 | 0.92 | 0.00 | 4.29 | 4.65 | 4.89 | 5.00 | <U+2581><U+2581><U+2581><U+2581><U+2587> |
| review_scores_accuracy | 3130 | 0.48 | 4.58 | 0.67 | 0.00 | 4.50 | 4.78 | 5.00 | 5.00 | <U+2581><U+2581><U+2581><U+2581><U+2587> |
| review_scores_cleanliness | 3130 | 0.48 | 4.46 | 0.69 | 0.00 | 4.33 | 4.66 | 4.90 | 5.00 | <U+2581><U+2581><U+2581><U+2581><U+2587> |
| review_scores_checkin | 3130 | 0.48 | 4.67 | 0.62 | 0.00 | 4.65 | 4.86 | 5.00 | 5.00 | <U+2581><U+2581><U+2581><U+2581><U+2587> |
| review_scores_communication | 3131 | 0.48 | 4.68 | 0.61 | 1.00 | 4.67 | 4.88 | 5.00 | 5.00 | <U+2581><U+2581><U+2581><U+2581><U+2587> |
| review_scores_location | 3130 | 0.48 | 4.73 | 0.50 | 1.00 | 4.68 | 4.88 | 5.00 | 5.00 | <U+2581><U+2581><U+2581><U+2581><U+2587> |
| review_scores_value | 3130 | 0.48 | 4.48 | 0.65 | 1.00 | 4.35 | 4.64 | 4.85 | 5.00 | <U+2581><U+2581><U+2581><U+2581><U+2587> |
| calculated_host_listings_count | 0 | 1.00 | 96.62 | 137.08 | 1.00 | 3.00 | 11.00 | 185.00 | 376.00 | <U+2587><U+2581><U+2581><U+2581><U+2582> |
| calculated_host_listings_count_entire_homes | 0 | 1.00 | 18.74 | 38.32 | 0.00 | 0.00 | 2.00 | 16.00 | 155.00 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| calculated_host_listings_count_private_rooms | 0 | 1.00 | 70.73 | 119.53 | 0.00 | 0.00 | 5.00 | 49.00 | 333.00 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| calculated_host_listings_count_shared_rooms | 0 | 1.00 | 6.97 | 15.90 | 0.00 | 0.00 | 0.00 | 4.00 | 64.00 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| reviews_per_month | 3067 | 0.49 | 0.87 | 1.43 | 0.01 | 0.10 | 0.36 | 1.06 | 33.00 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
# See minimum values for numeric values
summary(reduced_listings)%>%
kable(format = "html", caption = "Summary of Reduced Listings Data", format.args = list(scientific = FALSE, big.mark = ",")) %>% kable_classic()| host_since | host_location | host_response_time | host_response_rate | host_acceptance_rate | host_is_superhost | host_neighbourhood | host_listings_count | host_total_listings_count | host_has_profile_pic | host_identity_verified | neighbourhood | neighbourhood_cleansed | neighbourhood_group_cleansed | latitude | longitude | property_type | room_type | accommodates | bathrooms | bathrooms_text | bedrooms | beds | amenities | price | minimum_nights | maximum_nights | minimum_nights_avg_ntm | maximum_nights_avg_ntm | has_availability | availability_30 | availability_60 | availability_90 | availability_365 | number_of_reviews | number_of_reviews_ltm | number_of_reviews_l30d | review_scores_rating | review_scores_accuracy | review_scores_cleanliness | review_scores_checkin | review_scores_communication | review_scores_location | review_scores_value | license | instant_bookable | calculated_host_listings_count | calculated_host_listings_count_entire_homes | calculated_host_listings_count_private_rooms | calculated_host_listings_count_shared_rooms | reviews_per_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. :2010-01-09 | Length:6046 | Length:6046 | Min. : 0.00 | Min. : 0.0 | Mode :logical | Length:6046 | Min. : 0.0 | Min. : 0.0 | Mode :logical | Mode :logical | Length:6046 | Length:6046 | Mode:logical | Min. :22.16 | Min. :113.9 | Length:6046 | Length:6046 | Min. : 0.000 | Mode:logical | Length:6046 | Min. : 1.000 | Min. : 0.000 | Length:6046 | Min. : 0.0 | Min. : 1.00 | Min. : 1.0 | Min. : 1.00 | Min. : 1.0 | Mode :logical | Min. : 0.0 | Min. : 0.0 | Min. : 0.00 | Min. : 0.0 | Min. : 0.00 | Min. : 0.000 | Min. : 0.0000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :1.000 | Min. :1.000 | Min. :1.000 | Mode:logical | Mode :logical | Min. : 1.00 | Min. : 0.00 | Min. : 0.00 | Min. : 0.00 | Min. : 0.0100 | |
| 1st Qu.:2014-06-18 | Class :character | Class :character | 1st Qu.: 90.00 | 1st Qu.: 50.0 | FALSE:4718 | Class :character | 1st Qu.: 3.0 | 1st Qu.: 3.0 | FALSE:10 | FALSE:2556 | Class :character | Class :character | NA’s:6046 | 1st Qu.:22.28 | 1st Qu.:114.2 | Class :character | Class :character | 1st Qu.: 2.000 | NA’s:6046 | Class :character | 1st Qu.: 1.000 | 1st Qu.: 1.000 | Class :character | 1st Qu.: 214.0 | 1st Qu.: 1.00 | 1st Qu.: 365.0 | 1st Qu.: 1.00 | 1st Qu.:1124.0 | FALSE:18 | 1st Qu.:13.0 | 1st Qu.:35.0 | 1st Qu.:61.00 | 1st Qu.:116.0 | 1st Qu.: 0.00 | 1st Qu.: 0.000 | 1st Qu.: 0.0000 | 1st Qu.:4.290 | 1st Qu.:4.500 | 1st Qu.:4.330 | 1st Qu.:4.650 | 1st Qu.:4.670 | 1st Qu.:4.680 | 1st Qu.:4.350 | NA’s:6046 | FALSE:4349 | 1st Qu.: 3.00 | 1st Qu.: 0.00 | 1st Qu.: 0.00 | 1st Qu.: 0.00 | 1st Qu.: 0.1000 | |
| Median :2016-04-18 | Mode :character | Mode :character | Median : 99.00 | Median : 72.0 | TRUE :1327 | Mode :character | Median : 13.0 | Median : 13.0 | TRUE :6035 | TRUE :3489 | Mode :character | Mode :character | NA | Median :22.30 | Median :114.2 | Mode :character | Mode :character | Median : 2.000 | NA | Mode :character | Median : 1.000 | Median : 1.000 | Mode :character | Median : 397.5 | Median : 5.00 | Median :1125.0 | Median : 5.60 | Median :1125.0 | TRUE :6028 | Median :29.0 | Median :59.0 | Median :89.00 | Median :358.0 | Median : 0.00 | Median : 0.000 | Median : 0.0000 | Median :4.650 | Median :4.780 | Median :4.660 | Median :4.860 | Median :4.880 | Median :4.880 | Median :4.640 | NA | TRUE :1697 | Median : 11.00 | Median : 2.00 | Median : 5.00 | Median : 0.00 | Median : 0.3600 | |
| Mean :2016-05-03 | NA | NA | Mean : 82.94 | Mean : 68.5 | NA’s :1 | NA | Mean :104.2 | Mean :104.2 | NA’s :1 | NA’s :1 | NA | NA | NA | Mean :22.30 | Mean :114.2 | NA | NA | Mean : 2.696 | NA | NA | Mean : 1.285 | Mean : 1.618 | NA | Mean : 805.0 | Mean : 17.02 | Mean : 857.2 | Mean : 17.05 | Mean : 913.5 | NA | Mean :21.4 | Mean :45.3 | Mean :70.12 | Mean :256.5 | Mean : 17.62 | Mean : 1.399 | Mean : 0.1209 | Mean :4.384 | Mean :4.575 | Mean :4.465 | Mean :4.672 | Mean :4.681 | Mean :4.733 | Mean :4.475 | NA | NA | Mean : 96.62 | Mean : 18.74 | Mean : 70.73 | Mean : 6.97 | Mean : 0.8739 | |
| 3rd Qu.:2017-12-11 | NA | NA | 3rd Qu.:100.00 | 3rd Qu.:100.0 | NA | NA | 3rd Qu.:225.0 | 3rd Qu.:225.0 | NA | NA | NA | NA | NA | 3rd Qu.:22.31 | 3rd Qu.:114.2 | NA | NA | 3rd Qu.: 3.000 | NA | NA | 3rd Qu.: 1.000 | 3rd Qu.: 2.000 | NA | 3rd Qu.: 700.0 | 3rd Qu.: 29.00 | 3rd Qu.:1125.0 | 3rd Qu.: 29.00 | 3rd Qu.:1125.0 | NA | 3rd Qu.:30.0 | 3rd Qu.:60.0 | 3rd Qu.:90.00 | 3rd Qu.:364.0 | 3rd Qu.: 10.00 | 3rd Qu.: 0.000 | 3rd Qu.: 0.0000 | 3rd Qu.:4.890 | 3rd Qu.:5.000 | 3rd Qu.:4.900 | 3rd Qu.:5.000 | 3rd Qu.:5.000 | 3rd Qu.:5.000 | 3rd Qu.:4.850 | NA | NA | 3rd Qu.:185.00 | 3rd Qu.: 16.00 | 3rd Qu.: 49.00 | 3rd Qu.: 4.00 | 3rd Qu.: 1.0600 | |
| Max. :2021-09-19 | NA | NA | Max. :100.00 | Max. :100.0 | NA | NA | Max. :457.0 | Max. :457.0 | NA | NA | NA | NA | NA | Max. :22.56 | Max. :114.4 | NA | NA | Max. :16.000 | NA | NA | Max. :11.000 | Max. :16.000 | NA | Max. :84346.0 | Max. :1125.00 | Max. :1125.0 | Max. :1125.00 | Max. :1125.0 | NA | Max. :30.0 | Max. :60.0 | Max. :90.00 | Max. :365.0 | Max. :891.00 | Max. :110.000 | Max. :16.0000 | Max. :5.000 | Max. :5.000 | Max. :5.000 | Max. :5.000 | Max. :5.000 | Max. :5.000 | Max. :5.000 | NA | NA | Max. :376.00 | Max. :155.00 | Max. :333.00 | Max. :64.00 | Max. :33.0000 | |
| NA’s :1 | NA | NA | NA’s :889 | NA’s :1501 | NA | NA | NA’s :1 | NA’s :1 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA’s :1192 | NA’s :147 | NA | NA | NA | NA | NA’s :1 | NA’s :1 | NA | NA | NA | NA | NA | NA | NA | NA | NA’s :3067 | NA’s :3130 | NA’s :3130 | NA’s :3130 | NA’s :3131 | NA’s :3130 | NA’s :3130 | NA | NA | NA | NA | NA | NA | NA’s :3067 |
First, we will transform price into a new variable price_4_nights, decide whether it should be transformed to a logarithm, and change some categorical variables into fewer variables. Next, we will consider the correlation between variables. Rather than creating visualisations for all 51 variables, we will only create visualisations for the variables that are intuitively related to the price of a listing and are likely to be included in our regression model
For ease of analysis, we will create a price for 4 nights variable since that is the objective of this study
# Creating the price per 4 nights
reduced_listings <- reduced_listings %>%
mutate(price_4_nights = (price*4))Next, we will inspect the distribution of the variables
# Distribution of price_4_nights
ggplot(reduced_listings, aes(x = price_4_nights)) +
geom_density()+
labs(title = "Price per 4 nights", subtitle = "Density",
x = "Price per 4 nights",
y = "Number of listings") +
theme_economist_white()+theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))+
NULL Given the distribution of price_4_nights, it appears that the the price_4_nights variable is skewed to the right
To correct for this, we can log transform the variable to create a normal distribution, which will help with analysis and regression
# Creating the log(price per 4 nights)
reduced_listings <- reduced_listings %>%
mutate(log_price_4_nights=log(price_4_nights))
# Check distribution of data
# Distribution of log_price_4_nights
ggplot(reduced_listings, aes(x = log_price_4_nights)) +
geom_density()+
labs(title = "Log price per 4 nights", subtitle = "Density",
x = "Log price per 4 nights",
y = "Number of listings") +
theme_economist_white()+ theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))+
NULL The data is now approximately normally distributed
# Look at the distribution of property types
reduced_listings %>%
count(property_type, name="count", sort=TRUE) %>%
mutate(percent=round(100*count/sum(count),2)) %>%
rename(c("Property type" = "property_type", "Count" = "count", "Percentage" = "percent")) %>%
kable() %>%
kable_classic()| Property type | Count | Percentage |
|---|---|---|
| Private room in rental unit | 2120 | 35.06 |
| Entire rental unit | 1360 | 22.49 |
| Entire condominium (condo) | 419 | 6.93 |
| Entire serviced apartment | 347 | 5.74 |
| Shared room in rental unit | 236 | 3.90 |
| Private room in serviced apartment | 196 | 3.24 |
| Private room in condominium (condo) | 182 | 3.01 |
| Private room in hostel | 174 | 2.88 |
| Room in hotel | 118 | 1.95 |
| Private room in guesthouse | 116 | 1.92 |
| Room in boutique hotel | 98 | 1.62 |
| Private room in guest suite | 66 | 1.09 |
| Private room in residential home | 64 | 1.06 |
| Entire residential home | 61 | 1.01 |
| Shared room in hostel | 56 | 0.93 |
| Room in hostel | 52 | 0.86 |
| Private room in bed and breakfast | 44 | 0.73 |
| Entire guest suite | 37 | 0.61 |
| Room in aparthotel | 27 | 0.45 |
| Entire loft | 21 | 0.35 |
| Entire place | 20 | 0.33 |
| Room in bed and breakfast | 17 | 0.28 |
| Entire guesthouse | 16 | 0.26 |
| Shared room in condominium (condo) | 15 | 0.25 |
| Private room | 13 | 0.22 |
| Entire cottage | 12 | 0.20 |
| Private room in villa | 11 | 0.18 |
| Room in serviced apartment | 11 | 0.18 |
| Entire bungalow | 8 | 0.13 |
| Private room in tiny house | 8 | 0.13 |
| Private room in townhouse | 8 | 0.13 |
| Shared room in guesthouse | 8 | 0.13 |
| Shared room in residential home | 8 | 0.13 |
| Entire townhouse | 7 | 0.12 |
| Entire villa | 6 | 0.10 |
| Private room in loft | 6 | 0.10 |
| Tiny house | 6 | 0.10 |
| Houseboat | 5 | 0.08 |
| Private room in bungalow | 5 | 0.08 |
| Shared room in boutique hotel | 5 | 0.08 |
| Shared room in serviced apartment | 5 | 0.08 |
| Private room in cottage | 4 | 0.07 |
| Shared room in nature lodge | 4 | 0.07 |
| Boat | 3 | 0.05 |
| Farm stay | 3 | 0.05 |
| Shared room in bed and breakfast | 3 | 0.05 |
| Tent | 3 | 0.05 |
| Campsite | 2 | 0.03 |
| Cave | 2 | 0.03 |
| Pension | 2 | 0.03 |
| Private room in minsu | 2 | 0.03 |
| Shared room | 2 | 0.03 |
| Shared room in boat | 2 | 0.03 |
| Shared room in guest suite | 2 | 0.03 |
| Shared room in tiny house | 2 | 0.03 |
| Shared room in townhouse | 2 | 0.03 |
| Castle | 1 | 0.02 |
| Earth house | 1 | 0.02 |
| Entire home/apt | 1 | 0.02 |
| Hut | 1 | 0.02 |
| Island | 1 | 0.02 |
| Private room in boat | 1 | 0.02 |
| Private room in cabin | 1 | 0.02 |
| Private room in casa particular | 1 | 0.02 |
| Private room in earth house | 1 | 0.02 |
| Private room in hut | 1 | 0.02 |
| Private room in kezhan | 1 | 0.02 |
| Private room in nature lodge | 1 | 0.02 |
| Shared room in dome house | 1 | 0.02 |
| Shared room in pension | 1 | 0.02 |
# Organise into 5 groups
reduced_listings <- reduced_listings %>%
mutate(prop_type_simplified = case_when(
property_type %in% c("Private room in rental unit","Entire rental unit", "Entire condominium (condo)","Entire serviced apartment") ~ property_type,
TRUE ~ "Other"
))
# Check whether new categorisation has been successful
reduced_listings %>%
count(prop_type_simplified, name="count", sort=TRUE) %>%
mutate(percent=round(100*count/sum(count),2)) %>%
rename(c("Property type" = "prop_type_simplified", "Count" = "count", "Percentage" = "percent")) %>%
kable() %>%
kable_classic()| Property type | Count | Percentage |
|---|---|---|
| Private room in rental unit | 2120 | 35.06 |
| Other | 1800 | 29.77 |
| Entire rental unit | 1360 | 22.49 |
| Entire condominium (condo) | 419 | 6.93 |
| Entire serviced apartment | 347 | 5.74 |
The top four property types are Private room in rental unit, Entire rental unit, Entire condominium (condo) and Entire serviced apartment. Together, they make up approximately 70% of the total listings
# Create boxplot
chart_proptype <- reduced_listings %>%
ggplot(aes(x = prop_type_simplified, y = log_price_4_nights, group = prop_type_simplified)) +
geom_boxplot() +
coord_flip() +
theme_economist_white() +
labs(
title = "Entire condo and entire rental unit exhibit highest median prices",
subtitle = "Greatest dispersion in 'Other', as remaining variables are grouped here",
x = element_blank(),
y = "Log Price for 4 nights"
) +
theme(panel.border = element_blank())+theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))
chart_proptype The distribution is exactly as expected. Entire condos and entire rental units have the highest median prices. Private rooms in a rental unit are naturally the cheapest listings. As we grouped the remaining values of the variables in “Other”, there are many extreme outliers in this group.
# Look at the distribution of neighbourhoods
reduced_listings %>%
count(neighbourhood_cleansed, name="count", sort=TRUE) %>%
mutate(percent=round(100*count/sum(count),2)) %>%
rename(c("Neighbourhood" = "neighbourhood_cleansed", "Count" = "count", "Percentage" = "percent")) %>%
kable() %>%
kable_classic()| Neighbourhood | Count | Percentage |
|---|---|---|
| Yau Tsim Mong | 2351 | 38.89 |
| Wan Chai | 1370 | 22.66 |
| Central & Western | 920 | 15.22 |
| Islands | 268 | 4.43 |
| Kowloon City | 243 | 4.02 |
| Eastern | 233 | 3.85 |
| Sham Shui Po | 138 | 2.28 |
| Yuen Long | 130 | 2.15 |
| North | 120 | 1.98 |
| Sai Kung | 71 | 1.17 |
| Southern | 42 | 0.69 |
| Sha Tin | 41 | 0.68 |
| Tuen Mun | 35 | 0.58 |
| Kwun Tong | 27 | 0.45 |
| Tai Po | 23 | 0.38 |
| Tsuen Wan | 20 | 0.33 |
| Kwai Tsing | 7 | 0.12 |
| Wong Tai Sin | 7 | 0.12 |
# Organise into 5 groups
reduced_listings <- reduced_listings %>%
mutate(neighbourhood_categorical = case_when(
neighbourhood_cleansed %in% c("Yau Tsim Mong","Wan Chai", "Central & Western","Islands") ~ neighbourhood_cleansed,
TRUE ~ "Other"
))
# Check whether new categorisation has been successful
reduced_listings %>%
count(neighbourhood_categorical, name="count", sort=TRUE) %>%
mutate(percent=round(100*count/sum(count),2)) %>%
rename(c("Neighbourhood" = "neighbourhood_categorical", "Count" = "count", "Percentage" = "percent")) %>%
kable() %>%
kable_classic()| Neighbourhood | Count | Percentage |
|---|---|---|
| Yau Tsim Mong | 2351 | 38.89 |
| Wan Chai | 1370 | 22.66 |
| Other | 1137 | 18.81 |
| Central & Western | 920 | 15.22 |
| Islands | 268 | 4.43 |
The top four neighbourhoods are Yau Tsim Mong, Wan Chai, Central & Western and Islands. Together, they make up approximately 80% of the total listings.
# Create boxplot
chart_neighbourhood <- reduced_listings %>%
ggplot(aes(x = neighbourhood_categorical, y = log_price_4_nights, group = neighbourhood_categorical)) +
geom_boxplot() +
coord_flip() +
theme_economist_white() +
labs(
title = "Listings in the location 'Islands' have the highest median price",
subtitle = "Greatest dispersion in 'Other'",
x = element_blank(),
y = "Log Price for 4 nights"
) +
theme(panel.border = element_blank()) + theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))
chart_neighbourhood Overall, the boxplot shows many outliers with high prices for all neighbourhoods. As we expected, listings on islands have the highest median price. These should be more exotic locations for more affluent travelers with higher real estate prices. Wan Chai is one of the busiest commercial areas in Hong Kong. The buildings are tall, the apartments small and hence the prices for Airbnb apartments are also on the lower end.
# Look at the distribution of bathroom text
reduced_listings %>%
filter(bathrooms_text!="NA") %>%
count(bathrooms_text, name="count", sort=TRUE) %>%
mutate(percent=round(100*count/sum(count),2)) %>%
rename(c("Bathroom description" = "bathrooms_text", "Count" = "count", "Percentage" = "percent")) %>%
kable() %>%
kable_classic()| Bathroom description | Count | Percentage |
|---|---|---|
| 1 bath | 2840 | 47.14 |
| 1 private bath | 1196 | 19.85 |
| 1 shared bath | 1031 | 17.11 |
| 2 baths | 276 | 4.58 |
| 1.5 baths | 212 | 3.52 |
| 2 shared baths | 151 | 2.51 |
| 1.5 shared baths | 49 | 0.81 |
| 3 shared baths | 42 | 0.70 |
| 3 baths | 41 | 0.68 |
| 4 shared baths | 27 | 0.45 |
| 4 baths | 21 | 0.35 |
| 0 baths | 18 | 0.30 |
| Half-bath | 17 | 0.28 |
| Private half-bath | 16 | 0.27 |
| Shared half-bath | 16 | 0.27 |
| 2.5 baths | 14 | 0.23 |
| 2.5 shared baths | 10 | 0.17 |
| 0 shared baths | 9 | 0.15 |
| 5 baths | 7 | 0.12 |
| 8 baths | 6 | 0.10 |
| 9 baths | 4 | 0.07 |
| 10 baths | 3 | 0.05 |
| 3.5 shared baths | 3 | 0.05 |
| 6 baths | 3 | 0.05 |
| 5.5 baths | 2 | 0.03 |
| 10 shared baths | 1 | 0.02 |
| 11 baths | 1 | 0.02 |
| 3.5 baths | 1 | 0.02 |
| 4.5 baths | 1 | 0.02 |
| 4.5 shared baths | 1 | 0.02 |
| 5 shared baths | 1 | 0.02 |
| 5.5 shared baths | 1 | 0.02 |
| 7 baths | 1 | 0.02 |
| 8 shared baths | 1 | 0.02 |
| 9 shared baths | 1 | 0.02 |
# Organise into 5 groups
reduced_listings <- reduced_listings %>%
mutate(bathrooms = case_when(
bathrooms_text %in% c("1 bath","1 private bath", "1 shared bath","2 baths") ~ bathrooms_text,
TRUE ~ "Other"
))
# Check whether new categorisation has been successful
reduced_listings %>%
count(bathrooms, name="count", sort=TRUE) %>%
mutate(percent=round(100*count/sum(count),2)) %>%
rename(c("Bathroom description" = "bathrooms", "Count" = "count", "Percentage" = "percent")) %>%
kable() %>%
kable_classic()| Bathroom description | Count | Percentage |
|---|---|---|
| 1 bath | 2840 | 46.97 |
| 1 private bath | 1196 | 19.78 |
| 1 shared bath | 1031 | 17.05 |
| Other | 703 | 11.63 |
| 2 baths | 276 | 4.57 |
The top four bathroom descriptions are 1 bath, 1 private bath, 1 shared bath and 2 baths. Together, they make up approximately 90% of the total listings.
# Create boxplot
chart_baths <- reduced_listings %>%
ggplot(aes(x = bathrooms, y = log_price_4_nights, group = bathrooms)) +
geom_boxplot() +
coord_flip() +
theme_economist_white() +
labs(
title = "The more bathrooms the higher the price",
x = element_blank(),
y = "Log Price for 4 nights"
) +
theme(panel.border = element_blank()) + theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))
chart_baths No surprises to see here. One shared bathroom clearly corresponds to the lowest price. A private bathroom is more comfortable to use and has a higher median price. Two bathrooms correspond by far to the highest price. All of this was expected.
To expand our analysis, we have decided to decompose the amenities into important amenities, accessibility amenities, space amenities and gear amenities
# Grouping together amenities for data analysis
reduced_listings$amenity_count <- str_count(reduced_listings$amenities,",")+1
head(reduced_listings$amenities)[1] "[\"Washer\", \"Air conditioning\", \"Long term stays allowed\", \"Wifi\", \"Kitchen\", \"Elevator\"]"
[2] "[\"Heating\", \"Air conditioning\", \"Shampoo\", \"Long term stays allowed\", \"Essentials\", \"Bathtub\", \"Coffee maker\", \"Dishes and silverware\", \"Carbon monoxide alarm\", \"TV with standard cable\", \"Iron\", \"Oven\", \"Hot water\", \"Room-darkening shades\", \"Kitchen\", \"Cable TV\", \"Dryer\", \"First aid kit\", \"Wifi\", \"Cooking basics\", \"Hair dryer\", \"Washer\", \"Hangers\", \"Breakfast\", \"Dedicated workspace\", \"Refrigerator\", \"Smoke alarm\", \"Stove\", \"Lockbox\"]"
[3] "[\"Heating\", \"Washer\", \"Air conditioning\", \"Crib\", \"Shampoo\", \"TV with standard cable\", \"Long term stays allowed\", \"Iron\", \"Wifi\", \"Dedicated workspace\", \"Cooking basics\", \"Kitchen\", \"Hair dryer\", \"Lockbox\", \"Essentials\", \"Cable TV\"]"
[4] "[\"Air conditioning\", \"TV with standard cable\", \"Long term stays allowed\", \"Wifi\", \"Kitchen\", \"Cable TV\"]"
[5] "[\"Heating\", \"Washer\", \"Air conditioning\", \"Crib\", \"Shampoo\", \"TV with standard cable\", \"Long term stays allowed\", \"Iron\", \"Wifi\", \"Dedicated workspace\", \"Cooking basics\", \"Kitchen\", \"Hair dryer\", \"Lockbox\", \"Essentials\", \"Cable TV\", \"Elevator\"]"
[6] "[\"Heating\", \"Air conditioning\", \"Shampoo\", \"Long term stays allowed\", \"Essentials\", \"Dishes and silverware\", \"TV with standard cable\", \"Iron\", \"Hot water\", \"Kitchen\", \"Cable TV\", \"Crib\", \"Microwave\", \"Wifi\", \"Cooking basics\", \"Hair dryer\", \"Hangers\", \"Fire extinguisher\", \"Dedicated workspace\", \"Refrigerator\", \"Smoke alarm\", \"Lockbox\"]"
amenities <- unique(reduced_listings$amenities)
head(reduced_listings$amenities)[1] "[\"Washer\", \"Air conditioning\", \"Long term stays allowed\", \"Wifi\", \"Kitchen\", \"Elevator\"]"
[2] "[\"Heating\", \"Air conditioning\", \"Shampoo\", \"Long term stays allowed\", \"Essentials\", \"Bathtub\", \"Coffee maker\", \"Dishes and silverware\", \"Carbon monoxide alarm\", \"TV with standard cable\", \"Iron\", \"Oven\", \"Hot water\", \"Room-darkening shades\", \"Kitchen\", \"Cable TV\", \"Dryer\", \"First aid kit\", \"Wifi\", \"Cooking basics\", \"Hair dryer\", \"Washer\", \"Hangers\", \"Breakfast\", \"Dedicated workspace\", \"Refrigerator\", \"Smoke alarm\", \"Stove\", \"Lockbox\"]"
[3] "[\"Heating\", \"Washer\", \"Air conditioning\", \"Crib\", \"Shampoo\", \"TV with standard cable\", \"Long term stays allowed\", \"Iron\", \"Wifi\", \"Dedicated workspace\", \"Cooking basics\", \"Kitchen\", \"Hair dryer\", \"Lockbox\", \"Essentials\", \"Cable TV\"]"
[4] "[\"Air conditioning\", \"TV with standard cable\", \"Long term stays allowed\", \"Wifi\", \"Kitchen\", \"Cable TV\"]"
[5] "[\"Heating\", \"Washer\", \"Air conditioning\", \"Crib\", \"Shampoo\", \"TV with standard cable\", \"Long term stays allowed\", \"Iron\", \"Wifi\", \"Dedicated workspace\", \"Cooking basics\", \"Kitchen\", \"Hair dryer\", \"Lockbox\", \"Essentials\", \"Cable TV\", \"Elevator\"]"
[6] "[\"Heating\", \"Air conditioning\", \"Shampoo\", \"Long term stays allowed\", \"Essentials\", \"Dishes and silverware\", \"TV with standard cable\", \"Iron\", \"Hot water\", \"Kitchen\", \"Cable TV\", \"Crib\", \"Microwave\", \"Wifi\", \"Cooking basics\", \"Hair dryer\", \"Hangers\", \"Fire extinguisher\", \"Dedicated workspace\", \"Refrigerator\", \"Smoke alarm\", \"Lockbox\"]"
#remove excess characters
amenities = str_remove_all(amenities,"\"")
amenities = str_remove_all(amenities,"\\[")
amenities = str_remove_all(amenities,"\\]")
amenities = str_trim(amenities)
total_amenities <- str_trim(unlist(strsplit(amenities,"[,]")))
head(total_amenities,10) [1] "Washer" "Air conditioning"
[3] "Long term stays allowed" "Wifi"
[5] "Kitchen" "Elevator"
[7] "Heating" "Air conditioning"
[9] "Shampoo" "Long term stays allowed"
as.data.frame(table(total_amenities))%>%
arrange(desc(Freq)) %>%
rename(c("Amenities" = "total_amenities", "Frequency" = "Freq")) %>%
kable() %>%
kable_classic()| Amenities | Frequency |
|---|---|
| Air conditioning | 3702 |
| Long term stays allowed | 3645 |
| Wifi | 3613 |
| Essentials | 3133 |
| Hangers | 2851 |
| Hair dryer | 2747 |
| Shampoo | 2690 |
| Kitchen | 2469 |
| Elevator | 2395 |
| TV | 2323 |
| Hot water | 2268 |
| Dedicated workspace | 2193 |
| Washer | 2170 |
| Iron | 1959 |
| Smoke alarm | 1894 |
| Fire extinguisher | 1861 |
| Refrigerator | 1301 |
| Private entrance | 1283 |
| Bed linens | 1183 |
| Lock on bedroom door | 1144 |
| Cooking basics | 1069 |
| Dishes and silverware | 1025 |
| Heating | 1019 |
| Dryer | 979 |
| First aid kit | 975 |
| Microwave | 962 |
| Carbon monoxide alarm | 860 |
| Stove | 735 |
| Luggage dropoff allowed | 686 |
| Extra pillows and blankets | 651 |
| Cable TV | 526 |
| TV with standard cable | 511 |
| Host greets you | 457 |
| Paid parking off premises | 381 |
| Shower gel | 374 |
| Oven | 369 |
| Gym | 363 |
| Room-darkening shades | 347 |
| Coffee maker | 344 |
| Keypad | 344 |
| Hot water kettle | 306 |
| Patio or balcony | 304 |
| Smart lock | 302 |
| Ethernet connection | 294 |
| Pool | 272 |
| Cleaning before checkout | 258 |
| Lockbox | 225 |
| Hot tub | 219 |
| Free parking on premises | 217 |
| Paid parking on premises | 196 |
| Security cameras on property | 195 |
| Dining table | 194 |
| Bathtub | 193 |
| Freezer | 192 |
| Cleaning products | 175 |
| Building staff | 173 |
| Breakfast | 169 |
| Window guards | 157 |
| Free street parking | 153 |
| Body soap | 151 |
| Crib | 148 |
| Single level home | 144 |
| BBQ grill | 141 |
| Clothing storage | 136 |
| Drying rack for clothing | 131 |
| Conditioner | 125 |
| Wine glasses | 121 |
| Children books and toys | 112 |
| Indoor fireplace | 112 |
| Rice maker | 111 |
| Backyard | 106 |
| Toaster | 104 |
| Beachfront | 100 |
| High chair | 96 |
| Outdoor furniture | 96 |
| Laundromat nearby | 89 |
| Outdoor dining area | 77 |
| Pack Play/travel crib | 73 |
| Mini fridge | 72 |
| Waterfront | 71 |
| Pocket wifi | 70 |
| Baby bath | 68 |
| Children dinnerware | 68 |
| Portable fans | 67 |
| Dishwasher | 63 |
| Safe | 62 |
| Babysitter recommendations | 52 |
| Clothing storage: closet | 51 |
| Barbecue utensils | 50 |
| Baking sheet | 49 |
| Free washer In unit | 49 |
| Table corner guards | 44 |
| Game console | 40 |
| Outlet covers | 39 |
| Private patio or balcony | 39 |
| Sound system | 35 |
| Board games | 32 |
| Nespresso machine | 29 |
| EV charger | 26 |
| Bread maker | 24 |
| Electric stove | 22 |
| Portable heater | 22 |
| Changing table | 21 |
| Beach essentials | 20 |
| Ceiling fan | 20 |
| Outdoor shower | 19 |
| Sound system with Bluetooth and aux | 19 |
| Window AC unit | 18 |
| Bluetooth sound system | 16 |
| Induction stove | 16 |
| Pour-over coffee | 16 |
| Free dryer In unit | 14 |
| Baby safety gates | 13 |
| Clothing storage: wardrobe | 13 |
| Washer unit | 13 |
| Netflix | 12 |
| Stainless steel electric stove | 12 |
| Wifi 1000 Mbps | 12 |
| Dedicated workspace: desk | 11 |
| Piano | 11 |
| Stainless steel oven | 11 |
| Dedicated workspace: table | 10 |
| Mosquito net | 10 |
| Shared pool | 10 |
| standard cable | 10 |
| Gas stove | 9 |
| Private hot tub | 9 |
| Central air conditioning | 8 |
| Central heating | 8 |
| Lake access | 8 |
| Paid parking garage off premises | 8 |
| Panasonic refrigerator | 8 |
| Record player | 8 |
| Baby monitor | 7 |
| Paid street parking off premises | 7 |
| Trash compactor | 7 |
| Washer building | 7 |
| Clothing storage: wardrobe and closet | 6 |
| Dryer unit | 6 |
| Bidet | 5 |
| Bikes | 5 |
| Clothing storage: closet and wardrobe | 5 |
| Game console: PS4 | 5 |
| HDTV with Netflix | 5 |
| premium cable | 5 |
| Shared patio or balcony | 5 |
| 32 HDTV with Chromecast | 4 |
| and office chair | 4 |
| Clothing storage: dresser | 4 |
| Dryer In building | 4 |
| Fire pit | 4 |
| Fireplace guards | 4 |
| Kayak | 4 |
| Paid parking lot off premises | 4 |
| Ping pong table | 4 |
| Pool table | 4 |
| Private fenced garden or backyard | 4 |
| Private garden or backyard | 4 |
| Radiant heating | 4 |
| Stainless steel induction stove | 4 |
| and desk | 3 |
| Clothing storage: closet and dresser | 3 |
| Clothing storage: dresser and closet | 3 |
| Clothing storage: walk-in closet | 3 |
| Dedicated workspace: desk and office chair | 3 |
| Dedicated workspace: monitor | 3 |
| Free dryer In building | 3 |
| Free washer In building | 3 |
| HBO Max | 3 |
| monitor | 3 |
| office chair | 3 |
| Private gym in building | 3 |
| Private pool | 3 |
| Sauna | 3 |
| Shared garden or backyard | 3 |
| Shared sauna | 3 |
| table | 3 |
| Wifi 100 Mbps | 3 |
| conditioner | 2 |
| and closet | 2 |
| and wardrobe | 2 |
| Apple TV | 2 |
| Boat slip | 2 |
| Clothing storage: wardrobe and walk-in closet | 2 |
| desk | 2 |
| dresser | 2 |
| Free driveway parking on premises | 2 |
| Free dryer | 2 |
| HDTV | 2 |
| HDTV with standard cable | 2 |
| Keurig coffee machine | 2 |
| Shared gym | 2 |
| Shared gym in building | 2 |
| Ski-in/Ski-out | 2 |
| Stainless steel gas stove | 2 |
| wardrobe | 2 |
| shampoo | 1 |
| 32 HDTV | 1 |
| 40 HDTV | 1 |
| 40 TV with standard cable | 1 |
| 42 HDTV with Apple TV | 1 |
| 42 HDTV with HBO Max | 1 |
| 42 HDTV with Netflix | 1 |
| 42 HDTV with standard cable | 1 |
| 43 HDTV | 1 |
| 44 HDTV with Amazon Prime Video | 1 |
| 48 HDTV with Apple TV | 1 |
| 50 HDTV with Apple TV | 1 |
| 55 HDTV with standard cable | 1 |
| 55 TV with standard cable | 1 |
| 65 HDTV with Amazon Prime Video | 1 |
| 65 TV | 1 |
| Aesop body soap | 1 |
| and table | 1 |
| Any body soap | 1 |
| B&O sound system with Bluetooth and aux | 1 |
| Bathrobes | 1 |
| Bed Head TIGI shampoo | 1 |
| Bed sheets and pillows | 1 |
| Bluetooth speaker | 1 |
| bose Bluetooth sound system | 1 |
| Bottled water | 1 |
| Breakfast buffet available $295 per person per day | 1 |
| CAMBRIDGE Bluetooth sound system | 1 |
| Children books and toys for ages 2-5 years old | 1 |
| Children books and toys for ages 2-5 years old and 5-10 years old | 1 |
| Chromecast | 1 |
| Clothing storage: closet and walk-in closet | 1 |
| Clothing storage: walk-in closet and dresser | 1 |
| Complimentary self parking | 1 |
| Complimentary valet parking | 1 |
| Concierge | 1 |
| Dedicated workspace: office chair | 1 |
| Dedicated workspace: office chair and desk | 1 |
| Dedicated workspace: office chair and table | 1 |
| Dedicated workspace: table and desk | 1 |
| Depends body soap | 1 |
| Dole body soap | 1 |
| Dove or similar body soap | 1 |
| Elemis body soap | 1 |
| Fitness center | 1 |
| Fortress refrigerator | 1 |
| Free carport on premises 3 spaces | 1 |
| Free washer | 1 |
| Free wifi | 1 |
| Gaggenau refrigerator | 1 |
| Game console: Nintendo Switch | 1 |
| Game console: Nintendo Wii | 1 |
| HDTV with Apple TV | 1 |
| Heated pool with poolside bar outdoor | 1 |
| Hitachi refrigerator | 1 |
| I have a small oven to cook basics. It is not big enough to roast a chicken! oven | 1 |
| J&J body soap | 1 |
| JBL Bluetooth sound system | 1 |
| JBL speaker Bluetooth sound system | 1 |
| Laundry services | 1 |
| LG refrigerator | 1 |
| Limited housekeeping weekly | 1 |
| Marshall Bluetooth sound system | 1 |
| Minibar | 1 |
| Onsite restaurant Above & Beyond | 1 |
| Paid dryer In unit | 1 |
| Paid valet parking on premises | 1 |
| Paid washer In building | 1 |
| Pansonic refrigerator | 1 |
| Phill Smith Be Gourgeous conditioner | 1 |
| Portable air conditioning | 1 |
| Private outdoor pool | 1 |
| Rejoice shampoo | 1 |
| Room service | 1 |
| samsung refrigerator | 1 |
| shampoo and conditioner. Shower wash. shampoo | 1 |
| Shared fenced garden or backyard | 1 |
| Shared outdoor heated infinity pool | 1 |
| Shared outdoor heated pool | 1 |
| Simpa stainless steel gas stove | 1 |
| Slippers | 1 |
| Speaker Bluetooth sound system | 1 |
| Supply the basics | 1 |
| Thai organic brand conditioner | 1 |
| Thai organic brand shampoo | 1 |
| Toiletries | 1 |
| toshiba stainless steel oven | 1 |
| Tresseme conditioner | 1 |
| TV with Amazon Prime Video | 1 |
| TV with Apple TV | 1 |
| TV with Netflix | 1 |
| Unknow oven | 1 |
| Unknown body soap | 1 |
| walk-in closet | 1 |
| Whirlpool induction stove | 1 |
| Whirpool refrigerator | 1 |
| White Westinhouse refrigerator | 1 |
| Wifi 174 Mbps | 1 |
| Wifi 20 Mbps | 1 |
reduced_listings$amenities = str_remove_all(reduced_listings$amenities,"\"")
reduced_listings$amenities = str_remove_all(reduced_listings$amenities,"\\[")
reduced_listings$amenities = str_remove_all(reduced_listings$amenities,"\\]")
reduced_listings$amenities = str_trim(reduced_listings$amenities)
head(reduced_listings$amenities)[1] "Washer, Air conditioning, Long term stays allowed, Wifi, Kitchen, Elevator"
[2] "Heating, Air conditioning, Shampoo, Long term stays allowed, Essentials, Bathtub, Coffee maker, Dishes and silverware, Carbon monoxide alarm, TV with standard cable, Iron, Oven, Hot water, Room-darkening shades, Kitchen, Cable TV, Dryer, First aid kit, Wifi, Cooking basics, Hair dryer, Washer, Hangers, Breakfast, Dedicated workspace, Refrigerator, Smoke alarm, Stove, Lockbox"
[3] "Heating, Washer, Air conditioning, Crib, Shampoo, TV with standard cable, Long term stays allowed, Iron, Wifi, Dedicated workspace, Cooking basics, Kitchen, Hair dryer, Lockbox, Essentials, Cable TV"
[4] "Air conditioning, TV with standard cable, Long term stays allowed, Wifi, Kitchen, Cable TV"
[5] "Heating, Washer, Air conditioning, Crib, Shampoo, TV with standard cable, Long term stays allowed, Iron, Wifi, Dedicated workspace, Cooking basics, Kitchen, Hair dryer, Lockbox, Essentials, Cable TV, Elevator"
[6] "Heating, Air conditioning, Shampoo, Long term stays allowed, Essentials, Dishes and silverware, TV with standard cable, Iron, Hot water, Kitchen, Cable TV, Crib, Microwave, Wifi, Cooking basics, Hair dryer, Hangers, Fire extinguisher, Dedicated workspace, Refrigerator, Smoke alarm, Lockbox"
# Find count of important amenities in amenities
reduced_listings$important_amenities = ifelse(grepl("Wifi",reduced_listings$amenities,ignore.case = TRUE,fixed = TRUE),1,0)+ifelse(grepl("Air conditioning",reduced_listings$amenities,ignore.case = TRUE,fixed = TRUE),1,0)+ifelse(grepl("Essentials",reduced_listings$amenities,ignore.case = TRUE,fixed = TRUE),1,0)
head(reduced_listings$important_amenities) [1] 2 3 3 2 3 3
# Find count of accessibility amenities in amenities
reduced_listings$accessibility = ifelse(grepl("Elevator",reduced_listings$amenities,ignore.case = TRUE,fixed = TRUE),1,0)+ifelse(grepl("Private Entrance",reduced_listings$amenities,ignore.case = TRUE,fixed = TRUE),1,0)
head(reduced_listings$accessibility)[1] 1 0 0 0 1 0
#find count of space amenities in amenities
reduced_listings$spacing = ifelse(grepl("Kitchen",reduced_listings$amenities,ignore.case = TRUE,fixed = TRUE),1,0)+ifelse(grepl("Workspace",reduced_listings$amenities,ignore.case = TRUE,fixed = TRUE),1,0)
head(reduced_listings$spacing)[1] 1 1 1 1 1 1
#find count of gear amenities in amenities
reduced_listings$gear_amenities = ifelse(grepl("TV",reduced_listings$amenities,ignore.case = TRUE,fixed = TRUE),1,0)+ifelse(grepl("Washer",reduced_listings$amenities,ignore.case = TRUE,fixed = TRUE),1,0)
head(reduced_listings$accessibility)[1] 1 0 0 0 1 0
# Show correlation among different review scores to find potential multicollinearity
reduced_listings %>%
select(log_price_4_nights, number_of_reviews,
review_scores_rating, review_scores_cleanliness,
review_scores_location, review_scores_value, reviews_per_month) %>%
ggpairs(alpha=0.3)+
theme_bw() Many of the review variables exhibit high correlation and linear relationships with each other. Especially review_score_value, review_score_rating and review_score_cleanliness exhibit correlation. To avoid multicollinearity in our model, we decided to only use one of them. At first we used review_scores_cleanliness because it has the highest correlation with price. Later on in the process, we decided to use number_of_reviews instead, as it increased the explanatory power of our model and fits better logically
# Show correlation among different numerical variables to find potential multicollinearity
reduced_listings %>%
select(log_price_4_nights, bedrooms, accommodates, number_of_reviews, host_acceptance_rate ) %>%
ggpairs(alpha=0.3)+
theme_bw() There is some correlation among the numerical values. Number of accomodates and bedrooms show a positive significant correlation of 0.5. This was expected as more accomodates require more beds and bedrooms. The other notable correlation we found was between host acceptance rate and number of reviews. This relationships also makes sense. The more guests a hosts accepts, the more reviews they will get.
# Show correlation among values of categorical variables
reduced_listings_2 <- reduced_listings %>%
select(log_price_4_nights, bathrooms, neighbourhood_categorical, prop_type_simplified, room_type)
model.matrix(~0+., data=reduced_listings_2) %>%
cor(use="everything") %>%
ggcorrplot(show.diag = F, type="lower", lab=TRUE, lab_size=4) Some features of the categorical variables seem to be correlated. Entire rental units often feature 1 bathroom, so do entire service apartments. Private rooms in a rental unit often have only a shared bathroom, which is what you would expect. Apartments in the area Yau Tsim Mong tend to have one private bathroom. The other results do not tell us much. Ignore all correlation coefficients for mutually exclusive variables. Obviously, the property type private room in rental unit is heavily correlated with the room type private room
# Show correlation among categorical variables
reduced_listings_3 <- reduced_listings %>%
select(log_price_4_nights, bathrooms, neighbourhood_categorical, prop_type_simplified, accommodates, host_acceptance_rate, number_of_reviews, bathrooms, bedrooms)
model.matrix(~0+., data=reduced_listings_3) %>%
cor(use="everything") %>%
ggcorrplot(show.diag = F, type="lower", lab=TRUE, lab_size=4) Across numerical and categorical variables, there is limited correlation. Accomodates (numerical) and 1 shared bathroom (categorical) exhibit slightly negative correlation. On the other hand, the correlation with 2 bathrooms is positive. This is what you would expect. The larger the number of accomodates, the more bathrooms there are. Accomodates is also negatively correlated to private rooms and positively correlated to entire rental units. Again, the same argument about size applies. In terms of geographical location, the Islands neighbourhood shows the highest correlation with accomodates. We expect the islands to be among the most expensive and largests domiciles and hence with the largest number of accomodates. The neighbourhood Wan Chai shows the highest negative correlation with number of reviews. This is perhaps a sign that Wan Chai is one of the most heavily populated areas with more listings than in demand.
# Histogram for host response rate
ggplot(reduced_listings, aes(x = host_response_rate)) +
geom_histogram(color="white") +
labs(title = "Host response rate distribution", subtitle = "Histogram",
x = "Host response rate (%)",
y = "Number of listings") +
theme_economist()+theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))+
NULL# Histogram for host acceptance rate
ggplot(reduced_listings, aes(x = host_acceptance_rate)) +
geom_histogram(color="white") +
labs(title = "Host acceptance rate distribution", subtitle = "Histogram",
x = "Host acceptance rate (%)",
y = "Number of listings") +
theme_economist()+theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))+
NULL# Histogram for accommodates
ggplot(reduced_listings, aes(x = accommodates)) +
geom_histogram(color="white") +
labs(title = "Accommodates distribution", subtitle = "Histogram",
x = "Number of person that can be accommodated",
y = "Number of listings") +
theme_economist()+theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))+
NULL# Density plot for beds distribution
ggplot(reduced_listings, aes(x = beds)) +
geom_density()+
labs(title = "Beds distribution", subtitle = "Density",
x = "Number of beds",
y = "Number of listings") +
theme_economist()+theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))+
NULL# Density plot for availability_30
ggplot(reduced_listings, aes(x = availability_30)) +
geom_density()+
labs(title = "Availability over next 30 days distribution", subtitle = "Density",
x = "Availability over next 30 days (in days)",
y = "Number of listings") +
theme_economist()+theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))+
NULL# Barchart for host response time
reduced_listings %>%
filter(host_response_time!="NA") %>%
ggplot(aes(x = host_response_time))+
geom_bar()+
labs(title = "Host response time distribution", subtitle = "Barchart",
x = "Host response time",
y = "Number of listings") +
theme_economist()+theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))+
NULL# Barchart for availability_30
reduced_listings %>%
filter(host_is_superhost!="NA") %>%
ggplot(aes(x = host_is_superhost))+
geom_bar()+
labs(title = "Superhost distribution", subtitle = "Barchart",
x = "Is the host a superhost?",
y = "Number of listings") +
theme_economist()+theme(axis.title.y = element_text(margin = margin(r=3)),axis.title.x = element_text(margin=margin(t=3)))+
NULL Here, we used some charts to visualise some important values, we expect to have a great influence on the price. It also allows us to identify outliers (e.g. host response rate of 0%, number of accomodates > 16).
Given the fact that we are building a model for four nights, it is important to get a better understanding of the distribution of the minimum nights
# Table with minimum nights
reduced_listings %>%
count(minimum_nights, name="count", sort=TRUE) %>%
rename("Minimum nights" = minimum_nights, "Frequency"="count") %>%
kable(align = "ll") %>%
kable_classic()| Minimum nights | Frequency |
|---|---|
| 1 | 2212 |
| 29 | 962 |
| 30 | 636 |
| 28 | 595 |
| 2 | 425 |
| 3 | 243 |
| 7 | 194 |
| 31 | 124 |
| 5 | 104 |
| 60 | 72 |
| 14 | 69 |
| 90 | 61 |
| 10 | 46 |
| 4 | 41 |
| 180 | 34 |
| 27 | 27 |
| 15 | 25 |
| 6 | 24 |
| 20 | 23 |
| 21 | 20 |
| 25 | 19 |
| 26 | 16 |
| 365 | 11 |
| 8 | 10 |
| 22 | 7 |
| 100 | 5 |
| 150 | 5 |
| 45 | 4 |
| 32 | 3 |
| 56 | 3 |
| 9 | 2 |
| 12 | 2 |
| 13 | 2 |
| 39 | 2 |
| 61 | 2 |
| 120 | 2 |
| 19 | 1 |
| 35 | 1 |
| 50 | 1 |
| 62 | 1 |
| 70 | 1 |
| 80 | 1 |
| 89 | 1 |
| 94 | 1 |
| 99 | 1 |
| 130 | 1 |
| 250 | 1 |
| 300 | 1 |
| 360 | 1 |
| 1125 | 1 |
The most common number of minimum nights is 1, followed by 29, 30, 28, 2, 3 and 7. Out of this list, the numbers 29, 30 and 28 clearly stand out. These “long-term” Airbnb bookings are different from a regular short-term booking, and are very likely to be a lease or sublet.
Since observations with a minimum of more than 4 nights would not be feasible for a model that predicts the price for four nights, these have been filtered out. As mentioned in the EDA, observations with a price of 0 and accommodates of 0 will be eliminated. Although one can argue that a maximum_nights of at least 4 nights could be added as a criteria, we have argued against it as someone can make a new booking with the new listing to prolong his or her stay
#Filtering the dataframe for minimum nights and faulty observations
reduced_listings <- reduced_listings %>%
filter (minimum_nights <= "4", price != "0", accommodates != 0)Below, you can find a map which shows all Airbnbs where the minimum nights is less than or equal to four nights in Hong Kong. Darker colors indicate more expensive airbnbs.
pal <- colorNumeric(palette = "BuPu", domain = reduced_listings$log_price_4_nights)
leaflet(data = filter(listings, minimum_nights <= 4)) %>%
addProviderTiles("OpenStreetMap.Mapnik") %>%
addCircleMarkers(lng = ~longitude,
lat = ~latitude,
radius = 1,
fillColor = ~pal(log(price)),
fillOpacity = 0.4,
popup = ~listing_url,
label = ~property_type,
color = ~pal(log(price)))model1 <- lm(price_4_nights ~factor(prop_type_simplified) + number_of_reviews + review_scores_rating, data=reduced_listings)
msummary(model1) Estimate Std. Error
(Intercept) 3609.492 1202.609
factor(prop_type_simplified)Entire rental unit 1301.983 893.123
factor(prop_type_simplified)Entire serviced apartment 394.116 1530.659
factor(prop_type_simplified)Other -242.214 876.853
factor(prop_type_simplified)Private room in rental unit -836.183 898.410
number_of_reviews -6.880 3.217
review_scores_rating 110.317 200.145
t value Pr(>|t|)
(Intercept) 3.001 0.00271 **
factor(prop_type_simplified)Entire rental unit 1.458 0.14502
factor(prop_type_simplified)Entire serviced apartment 0.257 0.79683
factor(prop_type_simplified)Other -0.276 0.78239
factor(prop_type_simplified)Private room in rental unit -0.931 0.35207
number_of_reviews -2.139 0.03255 *
review_scores_rating 0.551 0.58155
Residual standard error: 9566 on 2703 degrees of freedom
(2852 observations deleted due to missingness)
Multiple R-squared: 0.00887, Adjusted R-squared: 0.00667
F-statistic: 4.032 on 6 and 2703 DF, p-value: 0.0005005
The coefficient of review_scores_rating is -6.398. This means that the price per 4 nights will decrease by 6.398 USD if one extra review is written about the listing. The coefficient is significant at p=0.05.
The variable prop_type_simplified is divided among five categories (Private room in rental unit, Entire rental unit, Entire condominium (condo), Entire serviced apartment and other). In model 1, entire condominium (condo) is chosen as the baseline variable. The coefficient of each property type, can be interpreted as the USD change if the property type changes from an entire condominium (condo) to the property type of the coefficient. Given the fact that none of the dummy variables for property types is significant, the property type does not appear to be a predictor of airbnb prices per 4 nights in this model
Next, we consider the type of room that was listed
model2 <- lm(price_4_nights ~factor(prop_type_simplified) +number_of_reviews + review_scores_rating + factor(room_type), data=reduced_listings)
msummary(model2) Estimate Std. Error
(Intercept) 3603.502 1202.796
factor(prop_type_simplified)Entire rental unit 1300.857 891.919
factor(prop_type_simplified)Entire serviced apartment 394.638 1528.577
factor(prop_type_simplified)Other 1599.089 1140.863
factor(prop_type_simplified)Private room in rental unit 929.918 1253.650
number_of_reviews -6.820 3.223
review_scores_rating 111.332 200.505
factor(room_type)Hotel room -2858.442 1316.354
factor(room_type)Private room -1766.373 878.149
factor(room_type)Shared room -3400.340 1112.623
t value Pr(>|t|)
(Intercept) 2.996 0.00276 **
factor(prop_type_simplified)Entire rental unit 1.458 0.14482
factor(prop_type_simplified)Entire serviced apartment 0.258 0.79629
factor(prop_type_simplified)Other 1.402 0.16114
factor(prop_type_simplified)Private room in rental unit 0.742 0.45829
number_of_reviews -2.116 0.03442 *
review_scores_rating 0.555 0.57876
factor(room_type)Hotel room -2.171 0.02998 *
factor(room_type)Private room -2.011 0.04437 *
factor(room_type)Shared room -3.056 0.00226 **
Residual standard error: 9553 on 2700 degrees of freedom
(2852 observations deleted due to missingness)
Multiple R-squared: 0.01266, Adjusted R-squared: 0.009372
F-statistic: 3.848 on 9 and 2700 DF, p-value: 7.437e-05
The base level is entire home/apartment, and the remaining categories are hotel room, private room, and shared room. Choosing a hotel room or private room is significant at the 5% level, and choosing a shared room is significant at the 1% level. The way to interpret these is that the price per 4 nights will decrease by USD 2858 for a hotel room compared to an entire home apartment in a condo. Similarly, the price for a private room is USD 1766 less compared to an entire home/apartment, and USD 3400 less for a shared room, which follows logic that shared rooms would be the cheapest.
For the remainder of this assignment, we will use the log_price_4_nights variable instead of price_4_nights. Since prop_type_simplified and number_of_reviews were insignificant, we will remove them from the next model. First, we will check whether the number of bathrooms, bedrooms, beds, or size of the house (accomodates) are significant predictors. We will also check whether these are co-linear variables. The reasoning for this is that in the correlation matrix, these seemed to have a significant correlation with the dependent variable of price.
# New model
model3 <- lm(log_price_4_nights ~ number_of_reviews + factor(room_type) + bathrooms + bedrooms + beds + accommodates, data=reduced_listings)
msummary(model3) Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.5787487 0.0324557 233.511 < 2e-16 ***
number_of_reviews -0.0002478 0.0002430 -1.019 0.308
factor(room_type)Hotel room -0.9904779 0.0829418 -11.942 < 2e-16 ***
factor(room_type)Private room -0.8739274 0.0312877 -27.932 < 2e-16 ***
factor(room_type)Shared room -1.1078376 0.0565321 -19.597 < 2e-16 ***
bathrooms1 private bath 0.5777998 0.0348096 16.599 < 2e-16 ***
bathrooms1 shared bath -0.0340561 0.0353094 -0.965 0.335
bathrooms2 baths 0.2920691 0.0531295 5.497 4.07e-08 ***
bathroomsOther -0.0359973 0.0387997 -0.928 0.354
bedrooms 0.1291012 0.0178676 7.225 5.86e-13 ***
beds 0.0099620 0.0101364 0.983 0.326
accommodates 0.0728291 0.0067308 10.820 < 2e-16 ***
Residual standard error: 0.7338 on 4404 degrees of freedom
(1146 observations deleted due to missingness)
Multiple R-squared: 0.3832, Adjusted R-squared: 0.3817
F-statistic: 248.7 on 11 and 4404 DF, p-value: < 2.2e-16
# Check for co-linearity
car::vif(model3) GVIF Df GVIF^(1/(2*Df))
number_of_reviews 1.044338 1 1.021929
factor(room_type) 2.285178 3 1.147678
bathrooms 2.161502 4 1.101145
bedrooms 1.767274 1 1.329389
beds 2.273932 1 1.507956
accommodates 2.344080 1 1.531039
It appears that the bathrooms, bedrooms and accommodates are significant variables. Surprisingly, there is no co linearity between these variables (since the VIF is lower than 5). Also, adopting a log price model helps increase the explanatory power of the model to 38%. It also increases the significance of the variables. Given that the dependent variable is log transformed, the interpretation of coefficients changes. As before, the room type affects the price per 4 nights and having a shared room decreases the price. The new independent variables controls for the number of baths, beds, and the people that can be accommodated. There are a few noteworthy points
After removing the insignificant variables (number_of_reviews and beds), we will check whether superhosts (host_is_superhost) command a pricing premium
# New model
model4 <- lm(log_price_4_nights ~ factor(room_type) + bathrooms + bedrooms + accommodates +factor(host_is_superhost), data=reduced_listings)
msummary(model4) Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.626631 0.031686 240.695 < 2e-16 ***
factor(room_type)Hotel room -0.984526 0.081533 -12.075 < 2e-16 ***
factor(room_type)Private room -0.820064 0.031032 -26.426 < 2e-16 ***
factor(room_type)Shared room -1.083387 0.052978 -20.450 < 2e-16 ***
bathrooms1 private bath 0.499003 0.034108 14.630 < 2e-16 ***
bathrooms1 shared bath -0.042505 0.034407 -1.235 0.217
bathrooms2 baths 0.324447 0.052277 6.206 5.91e-10 ***
bathroomsOther -0.039004 0.038025 -1.026 0.305
bedrooms 0.125056 0.016978 7.366 2.08e-13 ***
accommodates 0.073037 0.005579 13.090 < 2e-16 ***
factor(host_is_superhost)TRUE -0.238758 0.026637 -8.963 < 2e-16 ***
Residual standard error: 0.7245 on 4517 degrees of freedom
(1034 observations deleted due to missingness)
Multiple R-squared: 0.3891, Adjusted R-squared: 0.3877
F-statistic: 287.6 on 10 and 4517 DF, p-value: < 2.2e-16
# Check for co-linearity
car::vif(model4) GVIF Df GVIF^(1/(2*Df))
factor(room_type) 2.126925 3 1.134032
bathrooms 2.165278 4 1.101385
bedrooms 1.675540 1 1.294427
accommodates 1.672316 1 1.293180
factor(host_is_superhost) 1.082233 1 1.040304
The host_is_superhost variable is significant at p=0.01 and the other independent variable types are significant, like before. Remarkably, it appears that the log price per 4 nights decreases when a host is a superhost. Specifically, when the host is a superhost, the price per 4 nights decreases by 26%.
Some hosts allow you to immediately book their listing (instant_bookable == TRUE), while a non-trivial proportion don’t. After controlling for other variables, we will check whether instant_bookable a significant predictor
# New model
model5 <- lm(log_price_4_nights ~ factor(room_type) + bathrooms + bedrooms + accommodates +factor(host_is_superhost) + factor(instant_bookable), data=reduced_listings)
msummary(model5) Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.562731 0.031758 238.136 < 2e-16 ***
factor(room_type)Hotel room -1.070046 0.080768 -13.248 < 2e-16 ***
factor(room_type)Private room -0.803659 0.030640 -26.229 < 2e-16 ***
factor(room_type)Shared room -1.105606 0.052287 -21.145 < 2e-16 ***
bathrooms1 private bath 0.437309 0.034079 12.832 < 2e-16 ***
bathrooms1 shared bath -0.023933 0.033974 -0.704 0.481
bathrooms2 baths 0.343148 0.051585 6.652 3.23e-11 ***
bathroomsOther -0.042205 0.037504 -1.125 0.260
bedrooms 0.121732 0.016748 7.269 4.26e-13 ***
accommodates 0.067756 0.005523 12.269 < 2e-16 ***
factor(host_is_superhost)TRUE -0.215427 0.026352 -8.175 3.81e-16 ***
factor(instant_bookable)TRUE 0.279986 0.024776 11.301 < 2e-16 ***
Residual standard error: 0.7145 on 4516 degrees of freedom
(1034 observations deleted due to missingness)
Multiple R-squared: 0.4059, Adjusted R-squared: 0.4044
F-statistic: 280.4 on 11 and 4516 DF, p-value: < 2.2e-16
# Check for co-linearity
car::vif(model5) GVIF Df GVIF^(1/(2*Df))
factor(room_type) 2.168170 3 1.137668
bathrooms 2.268737 4 1.107830
bedrooms 1.676057 1 1.294626
accommodates 1.684376 1 1.297835
factor(host_is_superhost) 1.088916 1 1.043511
factor(instant_bookable) 1.105441 1 1.051399
The instant_bookable variable is significant at p=0.01. It appears that a price premium is paid if a listing is instantly bookable. This follows, as the convenience of being able to book instantly would push people to be willing to spend higher. The model also now explains 40% of the data. The way to interpret the coefficient on a property being bookable is that for an entire apartment/house increases the price by 31% as opposed to a property that is not instantly bookable.
Next, we will inspect the impact of the neighborhood. For this, we have the listings in Hong Kong into five areas. Moreover, we will investigate important amenities, host identity verification, host acceptance rate, host listings and number of reviews
# New model
model6 <- lm(log_price_4_nights ~ factor(room_type) + bathrooms + bedrooms + accommodates+ factor(instant_bookable) + neighbourhood_categorical + important_amenities + factor(host_identity_verified) + host_acceptance_rate + calculated_host_listings_count + number_of_reviews, data=reduced_listings)
msummary(model6) Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.3892700 0.0871529 84.785 < 2e-16
factor(room_type)Hotel room -0.9328048 0.0850231 -10.971 < 2e-16
factor(room_type)Private room -0.6486390 0.0355496 -18.246 < 2e-16
factor(room_type)Shared room -0.8817947 0.0576430 -15.298 < 2e-16
bathrooms1 private bath 0.3832317 0.0393760 9.733 < 2e-16
bathrooms1 shared bath 0.0261429 0.0338742 0.772 0.440309
bathrooms2 baths 0.4057886 0.0512915 7.911 3.44e-15
bathroomsOther -0.0730408 0.0393916 -1.854 0.063796
bedrooms 0.1053971 0.0178175 5.915 3.65e-09
accommodates 0.0536096 0.0061368 8.736 < 2e-16
factor(instant_bookable)TRUE 0.2092476 0.0314237 6.659 3.22e-11
neighbourhood_categoricalIslands 0.2055465 0.0583848 3.521 0.000436
neighbourhood_categoricalOther -0.1148559 0.0385546 -2.979 0.002912
neighbourhood_categoricalWan Chai -0.1447215 0.0358287 -4.039 5.48e-05
neighbourhood_categoricalYau Tsim Mong -0.1547758 0.0366908 -4.218 2.53e-05
important_amenities 0.1328478 0.0232778 5.707 1.25e-08
factor(host_identity_verified)TRUE 0.1684831 0.0272263 6.188 6.83e-10
host_acceptance_rate -0.0018740 0.0004422 -4.238 2.32e-05
calculated_host_listings_count -0.0011920 0.0001306 -9.128 < 2e-16
number_of_reviews -0.0010516 0.0002336 -4.502 6.95e-06
(Intercept) ***
factor(room_type)Hotel room ***
factor(room_type)Private room ***
factor(room_type)Shared room ***
bathrooms1 private bath ***
bathrooms1 shared bath
bathrooms2 baths ***
bathroomsOther .
bedrooms ***
accommodates ***
factor(instant_bookable)TRUE ***
neighbourhood_categoricalIslands ***
neighbourhood_categoricalOther **
neighbourhood_categoricalWan Chai ***
neighbourhood_categoricalYau Tsim Mong ***
important_amenities ***
factor(host_identity_verified)TRUE ***
host_acceptance_rate ***
calculated_host_listings_count ***
number_of_reviews ***
Residual standard error: 0.622 on 3321 degrees of freedom
(2221 observations deleted due to missingness)
Multiple R-squared: 0.5395, Adjusted R-squared: 0.5369
F-statistic: 204.8 on 19 and 3321 DF, p-value: < 2.2e-16
# Check for co-linearity
car::vif(model6) GVIF Df GVIF^(1/(2*Df))
factor(room_type) 2.926549 3 1.195986
bathrooms 3.108303 4 1.152300
bedrooms 1.807903 1 1.344583
accommodates 1.976054 1 1.405722
factor(instant_bookable) 1.577734 1 1.256079
neighbourhood_categorical 1.758945 4 1.073140
important_amenities 1.730228 1 1.315381
factor(host_identity_verified) 1.595662 1 1.263195
host_acceptance_rate 1.446151 1 1.202560
calculated_host_listings_count 3.448548 1 1.857027
number_of_reviews 1.173653 1 1.083353
Overall, the neighborhood has a significant impact on the price per 4 nights. The new model, which controls for whether or not the property has important amenities, the number of host listings, and the type of neighborhood, explains 53% of the variability in data. The new independent variables are all significant at the 1% level. The base neighborhood is Central & Western and the interpretation on the coefficients is:
Next, we will incorporate the effect of avalability_30 or reviews_per_month on log_price_4_nights
# New model
model7 <- lm(log_price_4_nights ~ factor(room_type) + bathrooms + bedrooms + accommodates +factor(host_is_superhost) + factor(instant_bookable) + neighbourhood_categorical + important_amenities + factor(host_identity_verified) + host_acceptance_rate + calculated_host_listings_count + availability_30 + reviews_per_month, data=reduced_listings)
msummary(model7) Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.2867542 0.1511163 48.220 < 2e-16
factor(room_type)Hotel room -1.0009688 0.1130183 -8.857 < 2e-16
factor(room_type)Private room -0.6131677 0.0614492 -9.978 < 2e-16
factor(room_type)Shared room -1.4889174 0.1091533 -13.641 < 2e-16
bathrooms1 private bath 0.4475541 0.0613657 7.293 4.83e-13
bathrooms1 shared bath 0.0141115 0.0748819 0.188 0.850549
bathrooms2 baths 0.4001954 0.0845549 4.733 2.42e-06
bathroomsOther -0.0377981 0.0628356 -0.602 0.547569
bedrooms 0.0836681 0.0291892 2.866 0.004208
accommodates 0.0772091 0.0098076 7.872 6.53e-15
factor(host_is_superhost)TRUE -0.0043928 0.0459171 -0.096 0.923796
factor(instant_bookable)TRUE 0.2733612 0.0421635 6.483 1.21e-10
neighbourhood_categoricalIslands 0.2006377 0.0859812 2.334 0.019750
neighbourhood_categoricalOther -0.2016518 0.0717878 -2.809 0.005033
neighbourhood_categoricalWan Chai -0.1864330 0.0801457 -2.326 0.020139
neighbourhood_categoricalYau Tsim Mong -0.2867385 0.0677304 -4.234 2.44e-05
important_amenities 0.1511906 0.0421489 3.587 0.000345
factor(host_identity_verified)TRUE 0.2692909 0.0430306 6.258 5.04e-10
host_acceptance_rate -0.0027050 0.0006604 -4.096 4.43e-05
calculated_host_listings_count -0.0006289 0.0003103 -2.027 0.042855
availability_30 0.0017441 0.0014903 1.170 0.242046
reviews_per_month -0.0276561 0.0115593 -2.393 0.016852
(Intercept) ***
factor(room_type)Hotel room ***
factor(room_type)Private room ***
factor(room_type)Shared room ***
bathrooms1 private bath ***
bathrooms1 shared bath
bathrooms2 baths ***
bathroomsOther
bedrooms **
accommodates ***
factor(host_is_superhost)TRUE
factor(instant_bookable)TRUE ***
neighbourhood_categoricalIslands *
neighbourhood_categoricalOther **
neighbourhood_categoricalWan Chai *
neighbourhood_categoricalYau Tsim Mong ***
important_amenities ***
factor(host_identity_verified)TRUE ***
host_acceptance_rate ***
calculated_host_listings_count *
availability_30
reviews_per_month *
Residual standard error: 0.7091 on 1535 degrees of freedom
(4005 observations deleted due to missingness)
Multiple R-squared: 0.4329, Adjusted R-squared: 0.4251
F-statistic: 55.79 on 21 and 1535 DF, p-value: < 2.2e-16
# Check for co-linearity
car::vif(model7) GVIF Df GVIF^(1/(2*Df))
factor(room_type) 4.594357 3 1.289350
bathrooms 3.939700 4 1.186951
bedrooms 2.043575 1 1.429537
accommodates 2.136886 1 1.461809
factor(host_is_superhost) 1.142213 1 1.068743
factor(instant_bookable) 1.324012 1 1.150657
neighbourhood_categorical 1.841446 4 1.079307
important_amenities 1.368281 1 1.169736
factor(host_identity_verified) 1.176545 1 1.084686
host_acceptance_rate 1.339702 1 1.157455
calculated_host_listings_count 1.692133 1 1.300820
availability_30 1.177546 1 1.085148
reviews_per_month 1.185278 1 1.088705
Overall, it appears that availability_30 is not a predictor of log prices, since it is insignificant. For the remainder of this project, we will use model 6 as our best model because of the adjusted R2 and significance of variables.
To check the robustness of our established model, model6, we run a series of diagnostic tests.
autoplot(model6) Since the QQ plot shows that the residuals don’t follow a normal distribution, we cannot use msummary to establish the interpretation of coefficients as we need a more robust estimator for our t-tests and confidence intervals. However, OLS is still the BLUE. To double check this, we run a Shapiro W Test and a Breusch - Pagan t test. The null hypothesis for the shapiro test is that the residuals follow a normal distribution and the null for the Breusch Pagan is that the data is homoskedastic.
shapiro.test(model6$residuals)
Shapiro-Wilk normality test
data: model6$residuals
W = 0.83712, p-value < 2.2e-16
bptest(model6)
studentized Breusch-Pagan test
data: model6
BP = 231.07, df = 19, p-value < 2.2e-16
We reject the null hypothesis of normality at the 1% level and reject the null hypothesis of homoskedasticity at the 1% level . Thus, to get robust estimates of the coefficients, we use a robust t test. We selected the HC0 type as the other types are more effective for small samples.
coeftest(model6, vcov = vcovHC(model6, type = "HC0"))
t test of coefficients:
Estimate Std. Error t value
(Intercept) 7.38927000 0.08760343 84.3491
factor(room_type)Hotel room -0.93280477 0.08716610 -10.7015
factor(room_type)Private room -0.64863901 0.03754449 -17.2765
factor(room_type)Shared room -0.88179474 0.06682907 -13.1948
bathrooms1 private bath 0.38323174 0.04362637 8.7844
bathrooms1 shared bath 0.02614289 0.02371966 1.1022
bathrooms2 baths 0.40578862 0.05211780 7.7860
bathroomsOther -0.07304076 0.04174750 -1.7496
bedrooms 0.10539706 0.03106102 3.3932
accommodates 0.05360962 0.00771786 6.9462
factor(instant_bookable)TRUE 0.20924761 0.04132860 5.0630
neighbourhood_categoricalIslands 0.20554654 0.04890183 4.2032
neighbourhood_categoricalOther -0.11485588 0.03481534 -3.2990
neighbourhood_categoricalWan Chai -0.14472146 0.02500214 -5.7884
neighbourhood_categoricalYau Tsim Mong -0.15477582 0.03073294 -5.0362
important_amenities 0.13284784 0.02104873 6.3114
factor(host_identity_verified)TRUE 0.16848315 0.02910709 5.7884
host_acceptance_rate -0.00187403 0.00043929 -4.2660
calculated_host_listings_count -0.00119198 0.00013456 -8.8582
number_of_reviews -0.00105163 0.00022631 -4.6469
Pr(>|t|)
(Intercept) < 2.2e-16 ***
factor(room_type)Hotel room < 2.2e-16 ***
factor(room_type)Private room < 2.2e-16 ***
factor(room_type)Shared room < 2.2e-16 ***
bathrooms1 private bath < 2.2e-16 ***
bathrooms1 shared bath 0.2704715
bathrooms2 baths 9.173e-15 ***
bathroomsOther 0.0802825 .
bedrooms 0.0006988 ***
accommodates 4.497e-12 ***
factor(instant_bookable)TRUE 4.351e-07 ***
neighbourhood_categoricalIslands 2.700e-05 ***
neighbourhood_categoricalOther 0.0009805 ***
neighbourhood_categoricalWan Chai 7.768e-09 ***
neighbourhood_categoricalYau Tsim Mong 5.003e-07 ***
important_amenities 3.131e-10 ***
factor(host_identity_verified)TRUE 7.767e-09 ***
host_acceptance_rate 2.045e-05 ***
calculated_host_listings_count < 2.2e-16 ***
number_of_reviews 3.500e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
To correct for the heteroskedasticity, we use a WLS adjustment on model 6
# heteroskedasticity is present, use WLS
wt <- 1/lm(abs(model6$residuals)~model6$fitted.values, na.action = "na.exclude")$fitted.values^2
wls_model_6 <- lm(log_price_4_nights ~ factor(room_type) + bathrooms + bedrooms + accommodates +factor(host_is_superhost) + factor(instant_bookable) + neighbourhood_categorical, data=reduced_listings)
skim(reduced_listings$log_price_4_nights)| Name | reduced_listings$log_pric… |
| Number of rows | 5562 |
| Number of columns | 1 |
| _______________________ | |
| Column type frequency: | |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| data | 0 | 1 | 7.46 | 0.89 | 1.39 | 6.67 | 7.33 | 7.92 | 12.73 | <U+2581><U+2581><U+2587><U+2582><U+2581> |
which(is.na(reduced_listings$log_price_4_nights))integer(0)
skim(model6$residuals)| Name | model6$residuals |
| Number of rows | 3341 |
| Number of columns | 1 |
| _______________________ | |
| Column type frequency: | |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| data | 0 | 1 | 0 | 0.62 | -2.21 | -0.33 | -0.06 | 0.19 | 5.61 | <U+2581><U+2587><U+2581><U+2581><U+2581> |
The QQplot shows that the non-normality has been corrected, although there is still the presence of large outliers with high leverage.
autoplot(wls_model_6)#Interaction between variables
coplot(log_price_4_nights ~ accommodates | neighbourhood_categorical * important_amenities,data = reduced_listings) We can see from the co-plot the varying distributions across different neighbourhoods. We can see that across the plots that Yau Tsim Mong tends to have more high accommodation airbnbs. We also see a spike in higher price airbnbs in the Other and Islands neighbourhood. For the amenities, we see that there are generally more high amenity airbnbs across the neighbourhoods and an increase in number of larger places as amenities increases.
Now, to compare all the models we have
huxreg(list("Model 1" = model1,"Model 2" = model2,"Model 3" = model3,"Model 4" = model4,"Model 5" = model5, "Model 6" = model6, "Model 7" = model7),
statistics = c('R squared' = 'r.squared',
'Adj. R Squared' = 'adj.r.squared',
'Residual SE' = 'sigma'),
bold_signif = 0.05
) %>%
set_caption('Comparison of models')| Model 1 | Model 2 | Model 3 | Model 4 | Model 5 | Model 6 | Model 7 | |
|---|---|---|---|---|---|---|---|
| (Intercept) | 3609.492 ** | 3603.502 ** | 7.579 *** | 7.627 *** | 7.563 *** | 7.389 *** | 7.287 *** |
| (1202.609) | (1202.796) | (0.032) | (0.032) | (0.032) | (0.087) | (0.151) | |
| factor(prop_type_simplified)Entire rental unit | 1301.983 | 1300.857 | |||||
| (893.123) | (891.919) | ||||||
| factor(prop_type_simplified)Entire serviced apartment | 394.116 | 394.638 | |||||
| (1530.659) | (1528.577) | ||||||
| factor(prop_type_simplified)Other | -242.214 | 1599.089 | |||||
| (876.853) | (1140.863) | ||||||
| factor(prop_type_simplified)Private room in rental unit | -836.183 | 929.918 | |||||
| (898.410) | (1253.650) | ||||||
| number_of_reviews | -6.880 * | -6.820 * | -0.000 | -0.001 *** | |||
| (3.217) | (3.223) | (0.000) | (0.000) | ||||
| review_scores_rating | 110.317 | 111.332 | |||||
| (200.145) | (200.505) | ||||||
| factor(room_type)Hotel room | -2858.442 * | -0.990 *** | -0.985 *** | -1.070 *** | -0.933 *** | -1.001 *** | |
| (1316.354) | (0.083) | (0.082) | (0.081) | (0.085) | (0.113) | ||
| factor(room_type)Private room | -1766.373 * | -0.874 *** | -0.820 *** | -0.804 *** | -0.649 *** | -0.613 *** | |
| (878.149) | (0.031) | (0.031) | (0.031) | (0.036) | (0.061) | ||
| factor(room_type)Shared room | -3400.340 ** | -1.108 *** | -1.083 *** | -1.106 *** | -0.882 *** | -1.489 *** | |
| (1112.623) | (0.057) | (0.053) | (0.052) | (0.058) | (0.109) | ||
| bathrooms1 private bath | 0.578 *** | 0.499 *** | 0.437 *** | 0.383 *** | 0.448 *** | ||
| (0.035) | (0.034) | (0.034) | (0.039) | (0.061) | |||
| bathrooms1 shared bath | -0.034 | -0.043 | -0.024 | 0.026 | 0.014 | ||
| (0.035) | (0.034) | (0.034) | (0.034) | (0.075) | |||
| bathrooms2 baths | 0.292 *** | 0.324 *** | 0.343 *** | 0.406 *** | 0.400 *** | ||
| (0.053) | (0.052) | (0.052) | (0.051) | (0.085) | |||
| bathroomsOther | -0.036 | -0.039 | -0.042 | -0.073 | -0.038 | ||
| (0.039) | (0.038) | (0.038) | (0.039) | (0.063) | |||
| bedrooms | 0.129 *** | 0.125 *** | 0.122 *** | 0.105 *** | 0.084 ** | ||
| (0.018) | (0.017) | (0.017) | (0.018) | (0.029) | |||
| beds | 0.010 | ||||||
| (0.010) | |||||||
| accommodates | 0.073 *** | 0.073 *** | 0.068 *** | 0.054 *** | 0.077 *** | ||
| (0.007) | (0.006) | (0.006) | (0.006) | (0.010) | |||
| factor(host_is_superhost)TRUE | -0.239 *** | -0.215 *** | -0.004 | ||||
| (0.027) | (0.026) | (0.046) | |||||
| factor(instant_bookable)TRUE | 0.280 *** | 0.209 *** | 0.273 *** | ||||
| (0.025) | (0.031) | (0.042) | |||||
| neighbourhood_categoricalIslands | 0.206 *** | 0.201 * | |||||
| (0.058) | (0.086) | ||||||
| neighbourhood_categoricalOther | -0.115 ** | -0.202 ** | |||||
| (0.039) | (0.072) | ||||||
| neighbourhood_categoricalWan Chai | -0.145 *** | -0.186 * | |||||
| (0.036) | (0.080) | ||||||
| neighbourhood_categoricalYau Tsim Mong | -0.155 *** | -0.287 *** | |||||
| (0.037) | (0.068) | ||||||
| important_amenities | 0.133 *** | 0.151 *** | |||||
| (0.023) | (0.042) | ||||||
| factor(host_identity_verified)TRUE | 0.168 *** | 0.269 *** | |||||
| (0.027) | (0.043) | ||||||
| host_acceptance_rate | -0.002 *** | -0.003 *** | |||||
| (0.000) | (0.001) | ||||||
| calculated_host_listings_count | -0.001 *** | -0.001 * | |||||
| (0.000) | (0.000) | ||||||
| availability_30 | 0.002 | ||||||
| (0.001) | |||||||
| reviews_per_month | -0.028 * | ||||||
| (0.012) | |||||||
| R squared | 0.009 | 0.013 | 0.383 | 0.389 | 0.406 | 0.539 | 0.433 |
| Adj. R Squared | 0.007 | 0.009 | 0.382 | 0.388 | 0.404 | 0.537 | 0.425 |
| Residual SE | 9565.743 | 9552.724 | 0.734 | 0.724 | 0.715 | 0.622 | 0.709 |
| *** p < 0.001; ** p < 0.01; * p < 0.05. | |||||||
#Testing the model
set.seed(123)
train_test_split <- initial_split(reduced_listings, prop = 0.75)
listings_train <- training(train_test_split)
listings_test <- testing(train_test_split)
rmse_train <- listings_train %>%
mutate(predictions = predict(model6, .)) %>%
summarise(sqrt(sum(predictions - log_price_4_nights, na.rm = TRUE) ^ 2/n())) %>%
pull()
rmse_train[1] 0.06453211
rmse_test <- listings_test %>%
mutate(predictions = predict(model6, .)) %>%
summarise(sqrt(sum(predictions - log_price_4_nights, na.rm = TRUE) ^ 2/n())) %>%
pull()
rmse_test[1] 0.1117461
Given that the RMSE for the training and testing model have a small margin of error, it proves that model6 isn’t overfitting the data
unique(reduced_listings$bathrooms)[1] "1 bath" "Other" "2 baths" "1 private bath"
[5] "1 shared bath"
testingairbnb = data.frame(room_type = c("Private room","Private room","Private room","Private room","Private room"),bathrooms = c("1 private bath","Other","1 private bath","1 private bath","2 baths"), bedrooms = c(1,2,1,1,2), accommodates = c(2,6,2,2,5), instant_bookable = c(TRUE,FALSE,TRUE,TRUE,TRUE),important_amenities = c(3,2,3,3,3), host_identity_verified=c(TRUE,TRUE,TRUE,TRUE,TRUE),calculated_host_listings_count = c(5,5,3,28,1), number_of_reviews = c(100,31,10,40,21),host_acceptance_rate = c(100,50,100,25,25), neighbourhood_categorical = c("Yau Tsim Mong","Islands","Wan Chai","Other","Other"))
exp(predict(model6, newdata = testingairbnb, interval = "confidence"))/4 fit lwr upr
1 530.1486 495.9108 566.7502
2 556.2463 490.1354 631.2744
3 590.0706 542.3562 641.9826
4 658.0704 591.5738 732.0417
5 925.4304 803.6984 1065.6005
We can use the above data to predict the price for 4 nights based on different parameters. The cheapest type of room would be a private room, with a private bath, one bedroom, that accommodates 2, in Yau Tsim Mong. The property is instantly bookable, has 3 important amenities, with a verified host who has 5 listings, 100 reviews, and accepts bookings 100% of the time. The price per night for this type of property is USD530, which means the price for 4 nights is USD2120.
Conversely, the most expensive accommodation is a private room with 2 baths, 2 bedrooms, that accommodates 5, is instantly bookable with 3 important amenities located in a neighbourhood outside the most popular 3 (Yau Tsim Mong, Islands, Wan Chai). The host’s identity is verified, has 1 listing, has received 21 reviews, and accepts bookings 25% of the time. The price per night is USD 925, and the price per 4 nights is USD3700
When compared to the official prices as listed on AirBnB, the property prices per night are USD 514, 617, 411, 1554, and 1029 respectively. We can see that the prices for the properties are skewed towards the higher end of the CI, which could be explained by the heteroskedasticity of the data as well as the fact that COVID might have had an impact on the data used in the model, whereas latest prices have significantly changed since then.
In addition, as our model was trained with middle to high-end data, the only properties this has predictive power for is other middle to high-end properties as we can not extrapolate beyond the scope of our data. The 25th percentile of our prices is around 250 USD per night, so our predictive power decreases for low end airbnbs. The same is true with ultra-high end properties.
```